CS224d Assignment 1

Q1, Q2, Q3: Softmax Layer, Gradient Checking, Sigmoid Layer, Neural Network, SGD, word2vec


In [ ]:
import numpy as np
import random
import glob
import os.path as op
import cPickle as pickle

q1_softmax.py


In [ ]:
def softmax(x):
    """
    Compute the softmax function for each row of the input x.

    It is crucial that this function is optimized for speed because
    it will be used frequently in later code.
    You might find numpy functions np.exp, np.sum, np.reshape,
    np.max, and numpy broadcasting useful for this task. (numpy
    broadcasting documentation:
    http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)

    You should also make sure that your code works for
    one-dimensional inputs (treat the vector as a row); you might
    find this helpful for your later problems.

    You must implement the optimization in problem 1(a) of the 
    written assignment!
    """

    ### YOUR CODE HERE
    
    # Subtract the row-wise max before exponentiating, for numerical stability.
    # Since softmax(x) = softmax(x + c) for any constant c (problem 1(a)),
    # this does not change the result.
    if len(x.shape) == 1:
        e = np.exp(x - np.max(x))
        x = e / np.sum(e)
    else:
        e = np.exp(x - np.max(x, axis=1, keepdims=True))
        x = e / np.sum(e, axis=1, keepdims=True)
    ### END YOUR CODE
    
    return x
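
A quick sanity check (my own, not part of the starter code): each output row should be positive and sum to 1, and because softmax is invariant to adding a constant to a row, shifted inputs should give identical results.

In [ ]:
# Hypothetical softmax sanity check.
print softmax(np.array([1, 2]))                  # expect roughly [ 0.26894142  0.73105858]
print softmax(np.array([[1001, 1002], [3, 4]]))  # each row should match the line above (shift invariance)
print softmax(np.array([[-1001, -1002]]))        # expect roughly [[ 0.73105858  0.26894142]]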

q2_gradcheck.py


In [ ]:
def gradcheck_naive(f, x):
    """ 
    Gradient check for a function f 
    - f should be a function that takes a single argument and outputs the cost and its gradients
    - x is the point (numpy array) to check the gradient at
    """ 

    rndstate = random.getstate()
    random.setstate(rndstate)  
    fx, grad = f(x) # Evaluate function value at original point
    h = 1e-4

    # Iterate over all indexes in x
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index

        ### try modifying x[ix] with h defined above to compute numerical gradients
        ### make sure you call random.setstate(rndstate) before calling f(x) each time, this will make it 
        ### possible to test cost functions with built in randomness later
        ### YOUR CODE HERE:
        
        old = x[ix]

        x[ix] = old + h
        random.setstate(rndstate)
        fxh1, _ = f(x)                     # cost at x + h; the returned gradient is not needed
        x[ix] = old - h
        random.setstate(rndstate)
        fxh2, _ = f(x)                     # cost at x - h
        numgrad = (fxh1 - fxh2) / (2 * h)  # central difference: (f(x+h) - f(x-h)) / 2h

        x[ix] = old                        # restore the original value
        ### END YOUR CODE

        # Compare gradients
        reldiff = abs(numgrad - grad[ix]) / max(1, abs(numgrad), abs(grad[ix]))
        if reldiff > 1e-5:
            print "Gradient check failed."
            print "First gradient error found at index %s" % str(ix)
            print "Your gradient: %f \t Numerical gradient: %f" % (grad[ix], numgrad)
            print "reldiff: %f Needed: %f" % (reldiff,1e-5)
            return
    
        it.iternext() # Step to next dimension
        

    print "Gradient check passed!"

q2_sigmoid.py


In [ ]:
def sigmoid(x):
    """
    Compute the sigmoid function for the input here.
    """
    
    ### YOUR CODE HERE
    x = 1.0/(1.0 + np.exp(-x))
    ### END YOUR CODE
    
    return x

def sigmoid_grad(f):
    """
    Compute the gradient for the sigmoid function here. Note that
    for this implementation, the input f should be the sigmoid
    function value of your original input x. 
    """
    
    ### YOUR CODE HERE
    # sigma'(x) = sigma(x) * (1 - sigma(x)); the input f is already sigma(x)
    f = f * (1 - f)
    ### END YOUR CODE
    
    return f
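
A small sanity check with values that are easy to verify by hand (my own numbers): sigmoid(1) is about 0.731, sigmoid(2) is about 0.881, and sigmoid_grad should return sigma(x)(1 - sigma(x)) for each entry.

In [ ]:
# Hypothetical sanity check for sigmoid and sigmoid_grad.
x = np.array([[1, 2], [-1, -2]], dtype=float)
f = sigmoid(x)
g = sigmoid_grad(f)
print f   # expect roughly [[ 0.73105858  0.88079708]  [ 0.26894142  0.11920292]]
print g   # expect roughly [[ 0.19661193  0.10499359]  [ 0.19661193  0.10499359]]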

q2_neural.py


In [ ]:
def forward_backward_prop(data, labels, params, dimensions):
    """ 
    Forward and backward propagation for a two-layer sigmoidal network 
    
    Compute the forward propagation for the cross-entropy cost,
    and the backward propagation for the gradients of all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])
    # e.g. dimensions = [10, 5, 10] with N = 20 training examples
    W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H))   # W1 (10,5)
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))        # b1 (1,5)
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))  # W2 (5,10)
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))      # b2 (1,10)

    ### YOUR CODE HERE: forward propagation
    z1 = np.dot(data,W1) + b1                           # data(20,10) * W1(10,5) + b1(1,5)= z1(20,5)
    h = sigmoid(z1)

    z2 = np.dot(h,W2) + b2                              # h(20,5) * W2 (5,10) + b2(1,10) = z2(20,10)
    y = softmax(z2)

    ### END YOUR CODE
    cost = np.sum(-np.log(y[labels == 1]))  # cross-entropy: only the entries of y where the one-hot label is 1 contribute
    ### YOUR CODE HERE: backward propagation
    delta = y - labels                      # dJ/dz2 for softmax + cross-entropy
    gradb2 = np.sum(delta, axis=0)
    gradW2 = np.dot(h.transpose(), delta)

    delta1 = np.multiply(sigmoid_grad(h), np.dot(delta, W2.transpose()))  # dJ/dz1: back through W2 and the sigmoid
    gradb1 = np.sum(delta1, axis=0)
    gradW1 = np.dot(data.transpose(), delta1)
    ### END YOUR CODE
    
    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(), 
        gradW2.flatten(), gradb2.flatten()))
    
    return cost, grad
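
To verify the backpropagation above, the analytic gradient can be compared against gradcheck_naive on random data. This is a sketch using the shapes from the comments (N = 20 examples, dimensions = [10, 5, 10]); the variable names here are my own.

In [ ]:
# Hypothetical gradient check of the two-layer net on random one-hot labels.
N = 20
dimensions = [10, 5, 10]
data = np.random.randn(N, dimensions[0])
labels = np.zeros((N, dimensions[2]))
for i in xrange(N):
    labels[i, random.randint(0, dimensions[2] - 1)] = 1  # one-hot label per example

params = np.random.randn((dimensions[0] + 1) * dimensions[1] +
                         (dimensions[1] + 1) * dimensions[2], )

gradcheck_naive(lambda p: forward_backward_prop(data, labels, p, dimensions),
                params)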

q3_sgd.py


In [ ]:
SAVE_PARAMS_EVERY = 1000


In [ ]:
def load_saved_params():
    """ A helper function that loads previously saved parameters and resets iteration start """
    st = 0
    for f in glob.glob("saved_params_*.npy"):
        iter = int(op.splitext(op.basename(f))[0].split("_")[2])
        if (iter > st):
            st = iter
            
    if st > 0:
        with open("saved_params_%d.npy" % st, "r") as f:
            params = pickle.load(f)
            state = pickle.load(f)
        return st, params, state
    else:
        return st, None, None
    
def save_params(iter, params):
    with open("saved_params_%d.npy" % iter, "w") as f:
        pickle.dump(params, f)
        pickle.dump(random.getstate(), f)

def sgd(f, x0, step, iterations, postprocessing = None, useSaved = False, PRINT_EVERY=10):
    """ Stochastic Gradient Descent """
    # Implement the stochastic gradient descent method in this        
    # function.                                                       
    
    # Inputs:                                                         
    # - f: the function to optimize, it should take a single        
    #     argument and yield two outputs, a cost and the gradient  
    #     with respect to the arguments                            
    # - x0: the initial point to start SGD from                     
    # - step: the step size for SGD                                 
    # - iterations: total iterations to run SGD for                 
    # - postprocessing: postprocessing function for the parameters  
    #     if necessary. In the case of word2vec we will need to    
    #     normalize the word vectors to have unit length.          
    # - PRINT_EVERY: specifies every how many iterations to output  

    # Output:                                                         
    # - x: the parameter value after SGD finishes  
    
    # Anneal learning rate every several iterations
    ANNEAL_EVERY = 20000
    
    if useSaved:
        start_iter, oldx, state = load_saved_params()
        if start_iter > 0:
            x0 = oldx
            step *= 0.5 ** (start_iter / ANNEAL_EVERY)
            
        if state:
            random.setstate(state)
    else:
        start_iter = 0
    
    x = x0
    
    if not postprocessing:
        postprocessing = lambda x: x
    
    expcost = None
    
    for iter in xrange(start_iter + 1, iterations + 1):
        ### Don't forget to apply the postprocessing after every iteration!
        ### You might want to print the progress every few iterations.

        cost = None
        ### YOUR CODE HERE
        cost, grad = f(x)      # evaluate cost and gradient at the current parameters
        x = x - step * grad    # gradient descent step
        x = postprocessing(x)  # e.g. re-normalize word vectors to unit length
        ### END YOUR CODE
        
        if iter % PRINT_EVERY == 0:
            if not expcost:
                expcost = cost
            else:
                expcost = .95 * expcost + .05 * cost
            print "iter %d: %f" % (iter, expcost)
        
        if iter % SAVE_PARAMS_EVERY == 0 and useSaved:
            save_params(iter, x)
            
        if iter % ANNEAL_EVERY == 0:
            step *= 0.5
    
    return x
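
A minimal smoke test for the optimizer (my own toy example): minimizing f(x) = sum(x**2) with gradient 2x should drive the iterate toward zero.

In [ ]:
# Hypothetical SGD smoke test on a simple quadratic.
quad = lambda x: (np.sum(x ** 2), 2 * x)
t = sgd(quad, np.array([1.5, -2.0]), 0.1, 1000, PRINT_EVERY=100)
print t   # expect values very close to [ 0.  0.]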

q3_word2vec.py


In [ ]:
def normalizeRows(x):
    """ Row normalization function """
    # Implement a function that normalizes each row of a matrix to have unit length
    
    ### YOUR CODE HERE
    x = x / (np.sqrt(np.sum(x**2, axis=1,keepdims=True)))
    ### END YOUR CODE
    
    return x
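
A quick check with rows whose norms are easy to compute by hand (my own example): [3, 4] has norm 5 and [1, 2] has norm sqrt(5).

In [ ]:
# Hypothetical sanity check for row normalization.
print normalizeRows(np.array([[3.0, 4.0], [1.0, 2.0]]))
# expect roughly [[ 0.6         0.8       ]
#                 [ 0.4472136   0.89442719]]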